#Alexander F. Gazmararian
#afg2@princeton.edu

#load packages
library(tidyverse)
library(tidycensus)
library(kableExtra)
library(here)

#load naics data
naics <- read.csv("https://data.bls.gov/cew/doc/titles/industry/industry_titles.csv")
#load CBP data
temp <- tempfile()
download.file("https://www2.census.gov/programs-surveys/cbp/datasets/2020/cbp20co.zip", temp)
cbp <- read.csv(unz(temp, "cbp20co.txt"), header = TRUE)
# subset to relevant industries
#oil and gas
##213111  Drilling oil and gas wells
##213112	Support Activities for Oil and Gas Operations
##211120	Crude Petroleum Extraction
##211130	Natural Gas Extraction
#coal
##213113	Support Activities for Coal Mining
##212114	Surface Coal Mining
##212115	Underground Coal Mining
##212111  Bituminous Coal and Lignite Surface Mining
##212112 - Bituminous Coal Underground Mining
##213113 - Support Activities for Coal Mining
##212113 Anthracite Mining
##221112 fossil fuel electric power generation
df <-
  cbp %>%
  dplyr::mutate(
    industry = dplyr::case_when(
      naics %in% c(
        "213111", #drilling oil and gas wells
        "213112", #support activities for oil and gas operations
        "211120", #crude petroleum extraction
        "211130", #natural gas extraction
        "237120", #oil and gas pipeline and related structures construction
        "486110", #pipeline transportation of crude oil 
        "486210", #pipeline transportation of natural gas
        "486990", #All Other Pipeline Transportation
        "486910", #pipeline transportation of refined petroleum products
        "424710", #petroleum bulk stations and terminals
        "424720", #petroleum and petroleum products merchant wholesalers (except bulk stations and terminals)
        "324110", #petroleum refineries
        "324121", #asphalt paving mixture and block manufacturing
        "324122", #asphalt shingle and coating materials manufacturing
        "324191", #petroleum lubricating oil and grease manufacturing
        "324199", #all other petroleum and coal product manufacturing
        "221210", #natural gas distribution
        "333131", #Mining Machinery and Equipment Manufacturing
        "333132", #Oil and Gas Field Machinery and Equipment Manufacturing
        "332420", #Metal Tank (Heavy Gauge) Manufacturing
        "454310" #Fuel Dealers
      ) ~ "oilgas",
      naics %in% c(
        "212114", #surface coal mining
        "212115", #underground coal mining
        "213113", #support activities for coal mining
        "212111", #Bituminous Coal and Lignite Surface Mining
        "212112", #Bituminous Coal Underground Mining
        "212113" #Anthracite Mining
      ) ~ "coal",
      naics %in% c("221112") ~ "power",
      !grepl("-", naics) & !grepl("/", naics) ~ "other"
    )
  )
df <- subset(df, !is.na(industry))
#aggregate to the county level
agg <-
  df %>%
  dplyr::group_by(fipstate, fipscty, industry) %>%
  dplyr::summarise(emp = sum(emp))
#create fixed fips codes
agg <-
  agg %>%
  dplyr::mutate(
    fips = dplyr::case_when(
      nchar(fipscty) == 1 ~ paste0(fipstate, "00", fipscty),
      nchar(fipscty) == 2 ~ paste0(fipstate, "0", fipscty),
      nchar(fipscty) == 3 ~ paste0(fipstate, fipscty)
    ),
    fips = as.numeric(fips)
  )
agg <- subset(agg, select = -c(fipstate, fipscty))
#flip
agg <-
  agg %>%
  tidyr::pivot_wider(names_from = industry, values_from = emp, values_fill = 0)
#create percent of industry in each county
fossil_share <-
  agg %>%
  dplyr::mutate(coal_share = coal / (coal + oilgas + power + other),
                oil_share = oilgas / (coal + oilgas + power + other),
                power_share = power / (coal + oilgas + power + other))
#fossil_share$fips <- as.numeric(fossil_share$fips)
fossil_share <- subset(fossil_share, !is.na(fips))
dim(fossil_share)
# download fip codes
fips <- read.csv("https://raw.githubusercontent.com/kjhealy/fips-codes/master/county_fips_master.csv")
fips <- subset(fips, select = c(fips, county_name, state_name))
#merge with fips
fossil_share <- merge(fossil_share, fips, by = "fips", all.x = TRUE)
fossil_share <- subset(fossil_share, !is.na(county_name))
dim(fossil_share)
# load census data
##get variables
vars <- tidycensus::load_variables(2010, "sf1")
census_results <-
  tidycensus::get_decennial(
    year = 2010,
    geography = "county",
    variables = c(
      pop = "P001001", #population
      age = "P013001", #median age for both sexes
      female = "P012026", #total female
      male = "P012002", # total male
      white = "P003002", # total white alone
      black = "P003003", #total black alone
      rural = "P002005" #total rural
    )
  )
#pivot wider
census_results <-
  census_results %>%
  tidyr::pivot_wider(values_from = value, names_from = variable)
#create percent
#pair with fossil share
rep <- merge(census_results, fossil_share, by.x = "GEOID", by.y = "fips", all = TRUE)
#compare counties
rep <-
  rep %>%
  dplyr::mutate(
    sample = dplyr::case_when(
      GEOID %in% c(42059, 54103, 54051, 54069) ~ "Study Area",
      # 54061, 42051, 42125
      coal_share > 0.05 | oil_share > 0.05 ~ "Energy Community",
      #coal_share > 0.1 & oil_share == 0 ~ "Coal County",
      #oil_share > 0.1 & coal_share == 0 ~ "Oil and Gas County",
      #coal_share >= 0.1 & oil_share >= 0.1 ~ "Coal, Oil and Gas County",
      coal_share <= 0.05 & oil_share <= 0.05 ~ "Non-Energy County"
    )
  ) %>%
  dplyr::filter(!is.na(sample))
#t-test by sample type
calc_t <- function(var) {
  t.out <- t.test(subset(rep, sample == "Study Area")[[var]], subset(rep, sample == "Energy Community")[[var]], alternative = "two.sided")
  t.out$statistic
  t.out$p.value
  t.out$estimate
  national.mean <- mean(subset(rep, sample == "Non-Energy County")[[var]], na.rm = TRUE)
  df <-
    data.frame(
      Measure = var,
      `Non-Energy County` = national.mean,
      `Study Area` = t.out$estimate[[1]],
      `Energy Community` = t.out$estimate[[2]],
      `t` = t.out$statistic,
      `p-value` = t.out$p.value
    )
  return(df)
}
var_list <- c("pop", "age", "female", "male", "white", "black", "rural", "coal_share", "oil_share")
t.table <- lapply(var_list, calc_t)
t.table <- do.call("rbind", t.table)
names(t.table) <- c("", "Non-Energy", "Study", "Energy", "$t$-stat", "$p$-value")
t.table[, 1] <- c("Population", "Median Age", "Female", "Male", "White", "Black", "Rural", "Coal Employment", "Oil/Gas Employment")
kableExtra::kbl(t.table, digits=2, row.names = FALSE, booktabs = TRUE, linesep = "",
                format = "latex", escape = FALSE,
                caption = "Study site representativeness of American fossil fuel counties. \\label{tab:ff_rep}", position = "h") %>%
  kableExtra::kable_styling(position = "center") %>%
  kableExtra::add_footnote(
    paste(
      "Notes:",
      "Demographic data from the 2010 Census.",
      "Fossil fuel employment data from the 2020 Census County Business Patterns survey.",
      "NAICS codes for oil/gas include: 213111, 213112, 211120, 211130, 237120, 486110, 486210",
      "486990, 486910, 424710, 424720, 324110, 324121, 324122, 324191, 324199, 221210",
      "333131, 333132, 332420, and 454310.",
      "NAICS codes for coal include: 212114, 212115, 213113, 212111, 212112, and 212113.",
      "Two-sided $t$-tests reported to assess balance."
    ),
    threeparttable = TRUE,
    notation = "none",
    escape = FALSE
  ) %>%
  cat(., file = here("output", "tables", "si_tab_3.3.txt"))
